#!/opt/lampp/bin/perl -w
use warnings;
use strict;
use DBI;
use Encode::Guess qw(gbk gb2312 utf8 euc-cn cp936);
use Encode;
require('config.pl');
require('function.pl');

# Open the MySQL connection used to store scraped rows.  The db_host, db_user
# and db_password entries of %ENV are presumably populated by config.pl
# (TODO confirm).  Abort right away if the connection cannot be established
# instead of failing later with a method call on an undefined handle.
my $db_conn = DBI->connect("DBI:mysql:database=zx110;host=$ENV{'db_host'}", $ENV{'db_user'}, $ENV{'db_password'})
	or die "cannot connect to database zx110: $DBI::errstr\n";
$db_conn->do('set names utf8');

# Listing pages are numbered from 1 within each index.
my $page = 1;

# One listing index is crawled per run, rotating daily through the 36
# single-character indexes 0-9 and A-Z.  The range is 0 .. 9 (not 0 .. 10) so
# that the list really has 36 entries, and the modulus is taken from the list
# itself so every entry, including 'Z', is reachable.
my @indexes = (0 .. 9, 'A' .. 'Z');

# Only the year offset and the day of the year are needed from localtime.
my ($year_off, $yday) = (localtime)[5, 7];
my $index = $indexes[($year_off * 365 + $yday) % @indexes];
# ---------------------------------------------------------------------------
# Crawl every listing page of the selected index, scrape each listed domain
# and store the result in domain_info.
# ---------------------------------------------------------------------------

# Run curl for $url and return whatever it wrote to standard output ('' when
# nothing was received).  The list form of the pipe open bypasses the shell,
# so text scraped from a remote page cannot inject shell syntax.
# @extra_opts are additional curl options (e.g. '-L' or '--retry', '10').
my $fetch = sub {
	my ($url, @extra_opts) = @_;
	my @cmd = ('curl', '-s', '-S', @extra_opts, '--connect-timeout', '10', '-m', '10', $url);
	my $curl;
	if (!open($curl, '-|', @cmd)) {
		print STDERR "cannot run curl for $url: $!\n";
		return '';
	}
	local $/;    # slurp the whole response at once
	my $body = <$curl>;
	close($curl);
	return defined $body ? $body : '';
};

# Return the first capture group of $pattern in $text, or $default when the
# pattern does not match.
my $first_capture = sub {
	my ($text, $pattern, $default) = @_;
	return $text =~ $pattern ? $1 : $default;
};

# Detail-page fields in the order they are printed and stored:
# [ column name in domain_info, pattern whose first group is the value ].
my @detail_fields = (
	[icp_license            => qr/网站备案\/许可证号：<\/td>\s*<td>([\d\D]*?)\s*<\/td>/],
	[icp_site_name          => qr/<td>网站名称：<\/td>\s*<td valign="top">\s*([\d\D]*?)\s*<\/td>/],
	[icp_home_page          => qr/<span style='float:left;display: block;'>\s*(.*?)\s*<\/span>/],
	[icp_review_date        => qr/<td>审核时间：<\/td>\s*<td>\s*([\d\-]*?)\s*<\/td>/],
	[icp_owner              => qr/<td>主办单位名称：<\/td>\s*<td>\s*([\d\D]*?)\s*<\/td>/],
	[icp_type               => qr/<td>主办单位性质：<\/td>\s*<td>\s*([\d\D]*?)\s*<\/td>/],
	[ic_company_name        => qr/<td width="120">公司名称：<\/td>\s*<td>\s*([\d\D]*?)\s*<\/td>/],
	[ic_responsible_persion => qr/<td>负责人：<\/td>\s*<td>\s*([\d\D]*?)\s*<\/td>/],
	[ic_registered_capital  => qr/<td>注册资金：<\/td>\s*<td>\s*([\d\D]*?)\s*<\/td>/],
	[ic_registration_no     => qr/<td>公司注册号：<\/td>\s*<td>\s*([\d\D]*?)\s*<\/td>/],
	[ic_company_address     => qr/<td>公司注册地址：<\/td>\s*<td>\s*([\d\D]*?)\s*<\/td>/],
	[ic_company_type        => qr/<td>企业类型：<\/td>\s*<td>\s*([\d\D]*?)\s*<\/td>/],
	[ic_business_scope      => qr/<td valign="top">经营范围：<\/td>\s*<td style="word-break:break-all;">\s*([\d\D]*?)\s*<\/td>/],
);

# REPLACE statement with one placeholder per column.  Every value is bound,
# never interpolated, so quotes in scraped text can neither break nor alter
# the statement.
my @columns = (
	'domain',
	(map { $_->[0] } @detail_fields),
	'star',
	'web_homepage_title',
	'web_homepage_keywords',
	'web_homepage_description',
	'web_homepage_index',
);
my $replace_sql = sprintf(
	'replace into domain_info(%s) values(%s)',
	join(', ', @columns),
	join(', ', ('?') x @columns),
);

# Scrape one domain: fetch its review page (up to three attempts while the
# star rating is missing), fetch the site's own home page, print a
# pipe-separated summary line and store everything in domain_info.
# Returns nothing; domains reported as invalid or without a rating are skipped.
my $scrape_domain = sub {
	my ($domain) = @_;
	my $detail_url = "http://pinggu.zx110.org/review_url_$domain";
	my $detail_html = '';
	my $star = -1;
	for my $attempt (1 .. 3) {
		$detail_html = $fetch->($detail_url, '--retry', '10');
		if ($detail_html =~ /是一个无效域名或无法访问/) {
			print "$domain invalid domain\n";
			return;
		}
		$star = $first_capture->($detail_html, qr/var starNum = '([\d\.]+)';/, -1);
		last if $star >= 0;
		print STDERR "$domain star invalid\n";
	}
	return if $star < 0;

	my @detail_values = map { $first_capture->($detail_html, $_->[1], '') } @detail_fields;

	# The home page is fetched only once the review page is known to be
	# usable, so retries and skipped domains cost no extra download.
	my $homepage_index = $fetch->("http://$domain/", '-L');

	# guess() returns an encoding object on success and a message string
	# otherwise; the checks below are applied to whichever name or message
	# results.  GBK-family pages are converted to UTF-8 before extraction.
	my $encoding = Encode::Guess->guess($homepage_index);
	$encoding = $encoding->name if (ref($encoding));
	if ($encoding =~ /euc-cn/ || $encoding =~ /cp936/) {
		$homepage_index = encode('utf8', decode('cp936', $homepage_index));
	}

	my $title = $first_capture->($homepage_index, qr/<title>\s*([\d\D]*?)\s*<\/title>/, '');
	my $keywords = $first_capture->($homepage_index, qr/<meta\s+name="keywords"\s+content="\s*([\d\D]*?)\s*"/, '');
	my $description = $first_capture->($homepage_index, qr/<meta\s+name="description"\s+content="\s*([\d\D]*?)\s*"/, '');

	print join('|', $domain, @detail_values, $star, $title, $keywords, $description), "\n";
	$db_conn->do($replace_sql, undef, $domain, @detail_values, $star, $title, $keywords, $description, $homepage_index);
	return;
};

PAGE: while (1) {
	my $list_url = "http://pinggu.zx110.org/sites_${index}_${page}";
	my $list_html = $fetch->($list_url, '--retry', '10');

	# Collect every listed domain first so each one is processed, including
	# all entries of the final page.
	my @domains = $list_html =~ /<a target="_blank" href=\/review_url_(.+?)>/g;
	for my $domain (@domains) {
		$scrape_domain->($domain);
	}

	# Stop when the page lists nothing (fetch failure or past the end) or
	# when it carries no "next page" link; otherwise the crawl would spin on
	# ever-increasing page numbers forever.
	last PAGE if !@domains;
	last PAGE if index($list_html, '下一页') < 0;
	++$page;
}
# Report completion, echoing the first two command-line arguments.  Missing
# arguments are printed as empty strings (same output as before) without
# triggering "uninitialized value" warnings.
my @run_args = map { defined $_ ? $_ : '' } @ARGV[0, 1];
print "$run_args[0] $run_args[1] done\n";
